create_graph <- function(flows) {
g <- tbl_graph(edges = flows, nodes = ccas) %>%
activate(edges) %>%
mutate(
from_name = .N()$name[from],
to_name = .N()$name[to]
) %>%
activate(nodes) %>%
# Compute centrality measures
mutate(
in_degree = centrality_degree(weights = n, mode = "in"),
out_degree = centrality_degree(weights = n, mode = "out"),
eigen = centrality_eigen(weights = n, directed = TRUE, scale = TRUE)
)
}
g2002 <- create_graph(cca_flows_2002)
g2022 <- create_graph(cca_flows_2022)
# Get the commuting network for the given year
g <- function(year) {
if (year == 2002) {
g2002
} else if (year == 2022) {
g2022
} else {
stop("Only years allowed are 2002 and 2022")
}
}
get_sf <- function(year) {
g(year) %>%
activate(nodes) %>%
as_tibble() %>%
st_as_sf()
}
# Number of nodes
g2022 %>% vcount()
## [1] 77
# Number of non-zero edges
g2022 %E>% filter(n > 0) %>% ecount()
## [1] 5656
Compute entropy measures. igraph
implements entropy with the diversity() function, but only
for undirected graphs, so we’ll have to implement in- and out-entropy
ourselves.
# A generic function to compute the in- or out-entropy value for a single node,
# given a vector of its in- or out-commuting flows.
entropy <- function(values, n = length(values)) {
normed <- values / sum(values)
# 0 values don't contribute to the sum
-sum(ifelse(normed == 0, 0, normed * log2(normed))) / log2(n)
}
# Tests. These all return TRUE.
# entropy(c(5, 5, 5), 3) == 1
# entropy(c(0,0,100), 3) == 0
# entropy(c(100), 3) == 0
# Add in-entropy and out-entropy to the network objects!
add_in_entropy <- function(g) {
if ("in_entropy" %in% colnames(g %N>% as_tibble())) {
warning("In-entropy has already been computed. Skipping.")
return(g)
}
g %>%
activate(nodes) %>%
mutate(
in_entropy = map_dbl(1:vcount(g), \(node) {
# Get all in-commuting flows to this node
g %>%
activate(edges) %>%
filter(to == node) %>%
pull(n) %>%
# Maximum number of edges to this node is 1 less than total nodes
entropy(vcount(g) - 1)
})
)
}
add_out_entropy <- function(g) {
if ("out_entropy" %in% colnames(g %N>% as_tibble())) {
warning("Out-entropy has already been computed. Skipping.")
return(g)
}
g %>%
activate(nodes) %>%
mutate(
out_entropy = map_dbl(1:vcount(g), \(node) {
# Get all out-commuting flows from this node
g %>%
activate(edges) %>%
filter(from == node) %>%
pull(n) %>%
# Maximum number of edges from this node is 1 less than total nodes
entropy(vcount(g) - 1)
})
)
}
g2002 <- g2002 %>% add_in_entropy() %>% add_out_entropy()
g2022 <- g2022 %>% add_in_entropy() %>% add_out_entropy()
I use these figures in my data section.
tract_flows <- read_csv("data/tract_flows_2022.csv", col_types = "cci")
tract_to_cca <- tracts %>% as_tibble() %>% select(GEOID, cca)
# The cca_flows data frames don't include anything outside Chicago, so we have
# to do these joins again.
tract_flows %>%
left_join(tract_to_cca, by = join_by(w_tract == GEOID)) %>%
rename(w_cca = cca) %>%
left_join(tract_to_cca, by = join_by(h_tract == GEOID)) %>%
rename(h_cca = cca) %>%
mutate(works_in_chicago = !is.na(w_cca), lives_in_chicago = !is.na(h_cca)) %>%
group_by(works_in_chicago, lives_in_chicago) %>%
summarize(n = sum(n), .groups = "drop")
## # A tibble: 4 × 3
## works_in_chicago lives_in_chicago n
## <lgl> <lgl> <int>
## 1 FALSE FALSE 3783860
## 2 FALSE TRUE 394784
## 3 TRUE FALSE 613740
## 4 TRUE TRUE 740302
set.seed(1)
g2022 %>%
activate(edges) %>%
filter(n > quantile(n, 0.6)) %>% # Keep only the biggest flows
activate(nodes) %>%
# Meters to km
mutate(distance_from_loop = distance_from_loop / 1000) %>%
ggraph(layout = "fr") +
geom_edge_link(
aes(edge_alpha = n),
edge_width = 0.2, edge_color = "#000",
) +
geom_node_point(aes(color = distance_from_loop), size = 3) +
geom_node_text(aes(label = name), repel = TRUE, size = 3) +
theme_void() +
theme_graph(
plot_margin = margin(10, 10, 10, 10),
) +
theme(
legend.background = element_rect(),
legend.margin = margin(8,8,8,8),
legend.position = "inside",
legend.position.inside = c(0.95, 0.05),
legend.justification = c(1, 0),
legend.box.just = "right",
legend.box.margin = margin(6, 6, 6, 6),
) +
labs(
color = "Distance from the Loop, km",
edge_alpha = "Commuting flow"
)
## In-degree
# Higher-order function to create map plotting functions that change fill according to some variable
map_by <- function(variable, legend_label, title, scale) {
function(year) {
get_sf(year) %>%
tm_shape() +
tm_polygons(
fill = variable,
fill.legend = tm_legend(title = legend_label),
fill.scale = scale,
) +
tm_title(glue("{title} ({year})"))
}
}
map_in_degree <- map_by("in_degree", "In-commuting flow", "Inter-CCA In-commuting", tm_scale_continuous_log(values = "brewer.oranges", limits = c(100, 300000)))
map_in_degree(2002)
map_in_degree(2022)
Change in in-degree from 2002 to 2022.
vars <- c("in_degree", "out_degree", "eigen", "in_entropy", "out_entropy")
combined <- left_join(
g2002 %N>% as_tibble() %>% select(name, all_of(vars)),
# geometry comes from this one
get_sf(2022) %>% select(name, all_of(vars)),
by = "name",
suffix = c("_2002", "_2022")
) %>% st_as_sf()
# Scatter plot (not used)
combined %>%
ggplot(aes(x = in_degree_2002, in_degree_2022)) +
geom_point() +
geom_text_repel(aes(label = name)) +
geom_abline(slope = 1, intercept = 0) +
geom_smooth(se = FALSE) +
scale_x_log10() +
scale_y_log10()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: ggrepel: 58 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
pct_change_map <- function(df, legend_title) {
df %>%
tm_shape() +
tm_polygons(
fill = "pct_change",
fill.scale = tm_scale_continuous(
values = "brewer.rd_bu", limits = c(-1, 1),
labels = c("-100%", "-50%", "0%", "+50%", "+100% or more"),
midpoint = 0,
outliers.trunc = c(TRUE, TRUE)
),
fill.legend = tm_legend(title = legend_title)
)
}
combined %>%
mutate(
change = in_degree_2022 - in_degree_2002,
pct_change = in_degree_2022 / in_degree_2002 - 1
) %>%
pct_change_map("In-degree % change")
map_out_degree <- map_by("out_degree", "Out-commuting flow", "Inter-CCA Out-commuting", tm_scale_continuous_log(values = "brewer.blues", limits = c(500, 40000)))
map_out_degree(2002)
map_out_degree(2022)
combined %>%
mutate(
change = out_degree_2022 - out_degree_2002,
pct_change = out_degree_2022 / out_degree_2002 - 1
) %>%
pct_change_map("Out-degree % change")
map_eigen <- map_by("eigen", "Eigenvector centrality", "Eigenvector Centrality",
tm_scale_continuous_log(values = "viridis", limits = c(0.0001, 1)))
map_eigen(2002)
map_eigen(2022)
combined %>%
mutate(pct_change = eigen_2022 / eigen_2002 - 1) %>%
pct_change_map("Percent change")
# Not included
cca_graph %N>%
as_tibble() %>%
mutate(area = if_else(
st_centroid(geometry) %>% st_coordinates() %>% .[, 2] >=
st_centroid(filter(ccas, name == "Near West Side")$geometry) %>%
st_coordinates() %>%
.[, 2],
"Northern",
"Southern"
)) %>%
arrange(desc(eig_centrality)) %>%
ggplot(aes(x = distance_from_loop, y = eig_centrality)) +
geom_point(aes(color = area)) +
geom_smooth(se = FALSE, method = "lm", size = 0.5, color = "black", formula = y ~ x) +
# geom_smooth(aes(color = area), se = FALSE, method = "lm", size = 0.5) +
geom_text_repel(aes(label = name), size = 2) +
scale_y_log10() +
scale_color_brewer(palette = "Set1") +
theme_classic() +
labs(
title = "Eigenvector Centrality vs. Distance from the Loop",
subtitle = '"Northern" neighborhoods are those that are at least as far north as the Near West Side.',
x = "Distance from the Loop (m)",
y = "Eigenvector Centrality",
color = "Area"
)
map_in_entropy <- map_by("in_entropy", "In-entropy", "In-entropy",
tm_scale_continuous_log(values = "brewer.reds", limits = c(0.75, 1)))
map_in_entropy(2002)
map_in_entropy(2022)
raw_change_map <- function(df, legend_title) {
df %>%
tm_shape() +
tm_polygons(
fill = "change",
fill.scale = tm_scale_continuous(
values = "brewer.rd_bu", limits = c(-0.2, 0.2),
labels = c("-0.2", "-0.1", "0", "+0.1", "+0.2"),
midpoint = 0
),
fill.legend = tm_legend(title = legend_title)
)
}
combined %>%
mutate(change = in_entropy_2022 - in_entropy_2002) %>%
raw_change_map("In-entropy change")
map_out_entropy <- map_by("out_entropy", "Out-entropy", "Out-entropy",
tm_scale_continuous_log(values = "brewer.purples", limits = c(0.4, 0.9)))
map_out_entropy(2002)
map_out_entropy(2022)
combined %>%
mutate(change = out_entropy_2022 - out_entropy_2002) %>%
raw_change_map("Out-entropy change")
df <- combined %>%
as_tibble() %>%
select(name, contains("_20")) %>%
pivot_longer(
contains("_20"),
names_to = c("variable", "year"), values_to = "value",
names_pattern = r"(^([a-z_]+)_(\d{4})$)"
) %>%
# This is the order of the variables I've been using in my paper
mutate(variable = factor(variable, levels = c("in_degree", "out_degree", "eigen", "in_entropy", "out_entropy"))) %>%
# Unpivot the years, which we want in separate columns so we can take the difference
pivot_wider(names_from = year, values_from = value, names_prefix = "value_") %>%
mutate(change = value_2022 - value_2002, pct_change = (value_2022 / value_2002 - 1) * 100) %>%
pivot_longer(
c(value_2002, value_2022, change, pct_change),
names_to = "subvariable", values_to = "value",
names_transform = . %>% str_replace("value_", "")
)
df %>%
group_by(variable, subvariable) %>%
summarize(mean = mean(value), sd = sd(value), se = sd / sqrt(77), med = median(value))
## `summarise()` has grouped output by 'variable'. You can override using the
## `.groups` argument.
## # A tibble: 20 × 6
## # Groups: variable [5]
## variable subvariable mean sd se med
## <fct> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 in_degree 2002 8437. 24079. 2744. 3128
## 2 in_degree 2022 8811. 29752. 3391. 2472
## 3 in_degree change 374. 6407. 730. -364
## 4 in_degree pct_change -6.27 49.9 5.68 -16.3
## 5 out_degree 2002 8437. 6180. 704. 6970
## 6 out_degree 2022 8811. 7119. 811. 6490
## 7 out_degree change 374. 2407. 274. -127
## 8 out_degree pct_change 6.61 39.3 4.48 -1.43
## 9 eigen 2002 0.0361 0.123 0.0140 0.00991
## 10 eigen 2022 0.0336 0.128 0.0145 0.00608
## 11 eigen change -0.00251 0.0152 0.00174 -0.00292
## 12 eigen pct_change -23.8 53.4 6.09 -37.5
## 13 in_entropy 2002 0.871 0.0357 0.00407 0.865
## 14 in_entropy 2022 0.882 0.0373 0.00425 0.887
## 15 in_entropy change 0.0112 0.0274 0.00312 0.0110
## 16 in_entropy pct_change 1.33 3.22 0.367 1.22
## 17 out_entropy 2002 0.692 0.0710 0.00809 0.709
## 18 out_entropy 2022 0.638 0.0802 0.00914 0.661
## 19 out_entropy change -0.0543 0.0427 0.00486 -0.0477
## 20 out_entropy pct_change -7.88 6.40 0.729 -6.74
t.test(combined$in_degree_2022, combined$in_degree_2002, paired = TRUE)
##
## Paired t-test
##
## data: combined$in_degree_2022 and combined$in_degree_2002
## t = 0.51231, df = 76, p-value = 0.6099
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -1080.134 1828.238
## sample estimates:
## mean difference
## 374.0519
t.test(combined$out_degree_2022, combined$out_degree_2002, paired = TRUE)
##
## Paired t-test
##
## data: combined$out_degree_2022 and combined$out_degree_2002
## t = 1.3639, df = 76, p-value = 0.1766
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -172.1816 920.2854
## sample estimates:
## mean difference
## 374.0519
t.test(combined$eigen_2022, combined$eigen_2002, paired = TRUE)
##
## Paired t-test
##
## data: combined$eigen_2022 and combined$eigen_2002
## t = -1.4462, df = 76, p-value = 0.1522
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.0059705949 0.0009473825
## sample estimates:
## mean difference
## -0.002511606
t.test(combined$in_entropy_2022, combined$in_entropy_2002, paired = TRUE)
##
## Paired t-test
##
## data: combined$in_entropy_2022 and combined$in_entropy_2002
## t = 3.5885, df = 76, p-value = 0.0005863
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 0.004977708 0.017394526
## sample estimates:
## mean difference
## 0.01118612
t.test(combined$out_entropy_2022, combined$out_entropy_2002, paired = TRUE)
##
## Paired t-test
##
## data: combined$out_entropy_2022 and combined$out_entropy_2002
## t = -11.172, df = 76, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.06401455 -0.04464320
## sample estimates:
## mean difference
## -0.05432887
ggplot(combined, aes(x = eigen_2002)) +
geom_density()
mean_distances <- g2002 %>%
activate(edges) %>%
as_tibble() %>%
# cca_distances uses CCA names, not the same indexes as the graphs
left_join(cca_distances, by = c("from_name" = "from", "to_name" = "to")) %>%
group_by(from_name) %>%
summarize(mean_distance = sum(n * distance) / sum(n)) %>%
rename(from = "from_name")
g2002 %>%
activate(nodes) %>%
left_join(mean_distances, by = c(name = "from")) %>%
as_tibble() %>%
st_as_sf() %>%
tm_shape() +
tm_polygons(
fill = "mean_distance",
fill.scale = tm_scale_continuous(values = "viridis", limits = c(4000, 25000))
)
mean_distances2 <- g2022 %>%
activate(edges) %>%
as_tibble() %>%
# cca_distances uses CCA names, not the same indexes as the graphs
left_join(cca_distances, by = c("from_name" = "from", "to_name" = "to")) %>%
group_by(from_name) %>%
summarize(mean_distance = sum(n * distance) / sum(n)) %>%
rename(from = "from_name")
g2022 %>%
activate(nodes) %>%
left_join(mean_distances2, by = c(name = "from")) %>%
as_tibble() %>%
st_as_sf() %>%
tm_shape() +
tm_polygons(
fill = "mean_distance",
fill.scale = tm_scale_continuous(values = "viridis", limits = c(4000, 25000))
)
g(2002) %>%
activate(edges) %>%
filter(n > quantile(n, 0.75)) %>%
activate(nodes) %>%
mutate(coreness = node_coreness()) %>%
as_tibble() %>%
st_as_sf() %>%
tm_shape() +
tm_polygons(fill = "coreness", fill.scale = tm_scale_ordinal())
get_dend <- function(g) {
adj <- as_adjacency_matrix(g, attr = "n", sparse = FALSE)
# Group community areas with similar in *and* out commuting
mtx_both <- rbind(adj, t(adj))
# Perform agglomerative clustering based on pairwise Pearson's correlations
hclust(as.dist(1 - cor(mtx_both), upper = FALSE), method = "ward.D")
}
add_structural_equiv <- function(g) {
if ("cluster" %in% colnames(g %N>% as_tibble())) {
g <- g %>% select(-cluster)
}
dendrogram <- get_dend(g)
# plot(dendrogram)
# Elbow plot shows that 5 clusters is a good choice for both years
# heights <- dendrogram$height
# num_clusters <- length(heights):1
# tibble(height = heights, num_clusters = num_clusters) %>%
# filter(num_clusters <= 20) %>%
# ggplot(aes(x = num_clusters, y = height)) +
# geom_point() +
# geom_line()
cluster_assignments <- cutree(dendrogram, k = 5) %>%
tibble(cluster = ., name = names(cluster))
# Add column to graph
g %>%
activate(nodes) %>%
left_join(cluster_assignments, by = "name")
}
g2002 <- add_structural_equiv(g2002)
g2022 <- add_structural_equiv(g2022)
map_struct_equiv <- map_by("cluster", "Structural equivalence class", "Structural Equivalence Clusters", scale = tm_scale_categorical())
tmap_arrange(map_struct_equiv(2002), map_struct_equiv(2022))
## [plot mode] fit legend/component: Some legend items or map compoments do not
## fit well, and are therefore rescaled.
## ℹ Set the tmap option `component.autoscale = FALSE` to disable rescaling.
## [plot mode] fit legend/component: Some legend items or map compoments do not
## fit well, and are therefore rescaled.
## ℹ Set the tmap option `component.autoscale = FALSE` to disable rescaling.
plot(get_dend(g2002))
plot(get_dend(g2022))
mat <- as_adjacency_matrix(g2002, attr = "n", sparse = FALSE)
rege <- REGE.nm.for(mat)$E
dend <- hclust(as.dist(1 - rege), method = "ward.D")
plot(dend)
plot.mat(mat, clu=cutree(dend, k=8))
One simple result we can draw from this dataset is simply where people who live in any given commuting area work. For example, below we compare the commuting flows for residents of Hyde Park and Woodlawn. Hyde Park and Woodlawn are adjacent, but because Hyde Park is home to the University of Chicago, most of its residents work there. Woodlawn, on the other hand, has a shortage of local jobs, so most Woodlawn residents commute to the Loop.
where_do_people_work <- function(cca) {
cca_flows_2022 %>%
filter(from == cca) %>%
select(to, n) %>%
arrange(desc(n))
}
where_do_people_work_plot <- function(cca) {
# For debugging:
# cca <- "Woodlawn"
where_do_people_work(cca) %>%
filter(to != "OUTSIDE CHICAGO") %>%
mutate(is_source = to == cca) %>%
# Put the source community last so that its borders are on top (plotted last)
arrange(is_source) %>%
# Must be left_join so that OUTSIDE CHICAGO is left out
left_join(ccas, by = c("to" = "name")) %>%
# If you want to use a log scale, try this
# mutate(log_n = log10(n + 1)) %>%
st_as_sf() %>%
tm_shape() +
tm_polygons(
fill = "n",
fill.legend = tm_legend(title = "Number of Workers"),
col = "is_source",
col.scale = tm_scale_categorical(values = c(`TRUE` = "red", `FALSE` = "lightblue", `NA` = "lightblue")),
col.legend = tm_legend_hide(),
lwd = "is_source",
lwd.scale = tm_scale_categorical(values = c(`TRUE` = 2, `FALSE` = 0.5)),
lwd.legend = tm_legend_hide(),
) +
tm_title(
glue("Where do {cca} residents work?"),
size = 1.5
)
}
where_do_people_work_plot("Woodlawn")
where_do_people_work_plot("Hyde Park")
This plot shows the number of local jobs per working resident in each community area. In neighborhoods with the largest employment, like O’Hare and the Loop, there are many more jobs than working residents. By contrast, many neighborhoods have nearly ten times as many working residents as jobs, so almost everyone must commute.
ccas <- ccas %>%
mutate(jobs_availability = w_total / h_total)
ccas %>%
# filter(name != "Loop", , name != "Ohare") %>%
# mutate(pct_work_in_same = work_in_same / w_total * 100) %>%
tm_shape() +
tm_polygons(
fill = "jobs_availability",
fill.legend = tm_legend(title = "Jobs Availability"),
fill.scale = tm_scale_continuous_log10(values = "viridis"),
)
Intuitively, we would expect that commuting is higher between neighborhoods that are closer together.
cca_flows_2022 %>%
filter(from != "OUTSIDE CHICAGO", to != "OUTSIDE CHICAGO") %>%
# Zero flows don't work with the log scale
filter(n != 0) %>%
# Setting amount = 1 makes the y-scale get wacky
ggplot(aes(x = distance, y = jitter(n, amount = 0.99))) +
# geom_density2d_filled() +
geom_point(size = 1, alpha = 0.3) +
# geom_smooth(se = FALSE, method = "lm", size = 1, color = "steelblue") +
# scale_x_log10() +
scale_y_log10() +
# scale_y_continuous(limits = c(0, 100)) +
theme_minimal() +
guides(fill = "none") +
labs(
title = "Distance vs. Flow Size",
x = "Distance (m)",
y = "Flow Size"
)
Modeling this dataset with an exponential random graph model (ERGM) would help understand what factors determine where people work. However, the ERGMs we learned about in class only work with unweighted networks. Krivitsky (2012) extended the ERGM framework to networks with values that represent counts, which is what we have here.
chicago_only <- cca_graph %N>%
filter(name != "OUTSIDE CHICAGO")
net <- asNetwork(chicago_only)
# Add node covariates
net %v% "residents_log" <- log(chicago_only %N>% pull(h_in_chicago))
net %v% "workers_log" <- log(chicago_only %N>% pull(w_from_chicago))
# Network covariate: geographic distances
# Make a full distance matrix. Recall that all dyads are represented in
# cca_flows, not just non-zero edges.
D <- cca_flows_2022 %>%
filter(from != "OUTSIDE CHICAGO", to != "OUTSIDE CHICAGO") %>%
select(from, to, distance) %>%
pivot_wider(names_from = to, values_from = distance) %>%
column_to_rownames("from") %>%
as.matrix()
fit1 <- ergm(
net ~
sum + # baseline intensity
nonzero + # sparsity
nodeocov("residents_log") + # origin size effect
nodeicov("workers_log") + # destination size effect
edgecov(D), # deterrence by distance
reference = ~Poisson,
response = "n",
)